Import the required libraries, using their conventional aliases
import re
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pygwalker as pg
import requests
import seaborn as sns
import ydata_profiling
from bs4 import BeautifulSoup
from ydata_profiling import ProfileReport
# URL of the Shiksha listing page for B.Tech colleges in India (the data source).
Url = 'https://www.shiksha.com/engineering/colleges/b-tech-colleges-india?rf=searchWidget&landing=ctp'
# Fetch the page, sending a browser-like User-Agent header. A timeout is set so
# that a stalled connection cannot hang the notebook indefinitely.
Scrap_Page = requests.get(Url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
# Inspect the HTTP status code: 200 means the page was served; any other value
# (for example 403 Forbidden) means the request was refused or failed.
Scrap_Page.status_code
200
# A DataFrame is built from lists, so start with one empty, independent list
# per column; the scraping loop appends to these.
(
    College_Name,     # college names
    Ratings,          # ratings
    Courses_offered,  # courses offered
    Annual_Package,   # annual package
    Fee_Range,        # fee range
    Location,         # location of the college
    College_Type,     # type of college
    Entrance_Exams,   # entrance exams accepted by the college
) = ([] for _ in range(8))
# Scrape the paginated listing. Each page is fetched and parsed, and the text of
# the tags of interest (located by their CSS class) is appended to the
# accumulator lists (College_Name, Location, Courses_offered, ...).
# NOTE(review): range(1, 25) requests pages 1 through 24; the original comment
# said 25 pages - use range(1, 26) if page 25 is really wanted.
# NOTE(review): names/cities, course links and type badges come from separate
# find_all() calls, so nothing guarantees that entry k of one list belongs to
# the same college as entry k of another (the course and type lists come out
# longer than the others). Parsing one listing card at a time would keep the
# columns aligned - confirm against the page markup.


def _first_match(pattern, text):
    """Return the first result of re.findall(pattern, text), or NaN if none."""
    found = re.findall(pattern, text)
    return found[0] if found else np.nan


for page in range(1, 25):
    Url = f'https://www.shiksha.com/engineering/colleges/b-tech-colleges-india-{page}?rf=searchWidget&landing=ctp'
    # The timeout keeps one stalled page from blocking the whole run.
    Scrap_Page = requests.get(Url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    if Scrap_Page.status_code != 200:
        # Say so explicitly instead of silently parsing an error page.
        print(f'Skipping page {page}: HTTP {Scrap_Page.status_code}')
        continue
    Soup = BeautifulSoup(Scrap_Page.text, 'html.parser')
    Name_Soup = Soup.find_all('div', attrs={'class': 'c8ff'})
    Course_Soup = Soup.find_all('a', attrs={'class': '_9865 ripple dark'})
    Multiple_Soup = Soup.find_all('div', attrs={'class': 'cd4f _5c64 contentColumn_2'})
    City_Soup = Soup.find_all('span', attrs={'class': '_5588'})
    Type_Soup = Soup.find_all('div', attrs={'class': 'edfa'})

    # Plain-text fields: the tag text is stored as-is.
    College_Name.extend(tag.text for tag in Name_Soup)
    Location.extend(tag.text for tag in City_Soup)
    Courses_offered.extend(tag.text for tag in Course_Soup)

    # Each Multiple_Soup block carries several fields in one text blob; pull
    # each one out with a regex and store NaN when a field is absent so that
    # the four lists always grow by exactly one entry per block.
    for tag in Multiple_Soup:
        details = tag.text
        Fee_Range.append(_first_match(r'Range₹\d+', details))
        Ratings.append(_first_match(r'Courses(\d\.\d)', details))
        Annual_Package.append(_first_match(r'Package₹\d+', details))
        # The original indexed [0] unconditionally here and raised IndexError
        # whenever the exams section was missing.
        Entrance_Exams.append(_first_match(r'Exams Accepted(.*?)Fees Range', details))

    # College type is whichever of "Govt" / "Pvt" appears first in the tag text.
    for tag in Type_Soup:
        College_Type.append(_first_match(r'Govt|Pvt', tag.text))
# Sanity check: print the length of every scraped list, one per line. The
# lists must all be equally long before they can become DataFrame columns.
for scraped_list in (College_Name, Ratings, Courses_offered, Annual_Package,
                     Fee_Range, Location, College_Type, Entrance_Exams):
    print(len(scraped_list))
511 511 833 511 511 511 833 511
# The course and college-type lists pick up more entries than there are
# colleges, so trim both to the number of college names actually scraped
# (rather than a hard-coded 511, which is only right for one particular run).
# NOTE(review): truncating only equalises the lengths; it does not prove that
# the kept entries line up with the right colleges - verify against the page.
Courses_Offered = Courses_offered[:len(College_Name)]
College_Type1 = College_Type[:len(College_Name)]
# Collect the scraped lists into a dictionary keyed by column name; the
# insertion order below is the column order of the resulting DataFrame.
Data = {}
Data['College Name'] = College_Name
Data['College Location'] = Location
Data['Rating'] = Ratings
Data['College Type'] = College_Type1
Data['Entrance Exams'] = Entrance_Exams
Data['Expected Fee'] = Fee_Range
Data['Average Package'] = Annual_Package
Data['Courses Offered'] = Courses_Offered
# Build the DataFrame from the dictionary and display it.
College_Data = pd.DataFrame(Data)
College_Data
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 0 | IIT Bombay - Indian Institute of Technology | Mumbai | 4.6 | Govt | JEE MainJEE AdvancedTotal | Range₹8 | Package₹8 | 24 Courses |
| 1 | IIT Delhi - Indian Institute of Technology | Delhi | 4.6 | Govt | JEE MainJEE AdvancedTotal | Range₹8 | Package₹15 | 18 Courses |
| 2 | VIT Vellore | Vellore | 4.2 | Pvt | TNEAVITEEECBSE 12thISCTamilnadu 12th +3 Total | Range₹5 | Package₹7 | 22 Courses |
| 3 | IIT Madras - Indian Institute of Technology | Chennai | 4.6 | NaN | JEE MainJEE AdvancedTotal | Range₹5 | Package₹15 | 25 Courses |
| 4 | DTU - Delhi Technological University | Delhi | 4.3 | NaN | JEE MainDASA UGJAC DelhiCBSE 12thISC +3 Total | Range₹3 | Package₹9 | 28 Courses |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 506 | JSPM’s Bhivarabai Sawant Institute of Technolo... | Pune | 4.3 | Govt | JEE MainMHT CETTotal | Range₹2 | NaN | 8 Courses |
| 507 | Bansal College of Engineering | Madhya Pradesh - Other | 4.1 | Pvt | JEE MainMP BETotal | Range₹2 | NaN | 6 Courses |
| 508 | University Institute of Technology, Jhabua - R... | Madhya Pradesh - Other | 3.8 | NaN | JEE MainTotal | NaN | NaN | 7 Courses |
| 509 | MET League of Colleges, Bhujbal Knowledge City... | Nashik | 4.2 | NaN | JEE MainMHT CETTotal | Range₹4 | NaN | 4 Courses |
| 510 | RMD Sinhgad School of Engineering | Pune | 3.7 | Govt | JEE MainMHT CETTotal | Range₹3 | NaN | 11 Courses |
511 rows × 8 columns
# The scraped values still carry their text labels ("24 Courses", "Range₹8",
# "Package₹8"); strip the labels so that only the numbers remain.
# regex= is passed explicitly: "Courses|Course" is a pattern, and pandas >= 2.0
# treats the first argument as a literal string unless regex=True is given,
# which would leave the column unchanged. The two prefixes are plain text.
# The label is replaced by nothing and the remainder is stripped, so no stray
# whitespace is left around the digits. Missing values stay NaN.
College_Data["Courses Offered"] = (
    College_Data["Courses Offered"].str.replace("Courses|Course", "", regex=True).str.strip()
)
College_Data["Expected Fee"] = (
    College_Data["Expected Fee"].str.replace("Range₹", "", regex=False).str.strip()
)
College_Data["Average Package"] = (
    College_Data["Average Package"].str.replace("Package₹", "", regex=False).str.strip()
)
College_Data  # display the DataFrame again after cleaning
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 0 | IIT Bombay - Indian Institute of Technology | Mumbai | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 8 | 24 |
| 1 | IIT Delhi - Indian Institute of Technology | Delhi | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 15 | 18 |
| 2 | VIT Vellore | Vellore | 4.2 | Pvt | TNEAVITEEECBSE 12thISCTamilnadu 12th +3 Total | 5 | 7 | 22 |
| 3 | IIT Madras - Indian Institute of Technology | Chennai | 4.6 | NaN | JEE MainJEE AdvancedTotal | 5 | 15 | 25 |
| 4 | DTU - Delhi Technological University | Delhi | 4.3 | NaN | JEE MainDASA UGJAC DelhiCBSE 12thISC +3 Total | 3 | 9 | 28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 506 | JSPM’s Bhivarabai Sawant Institute of Technolo... | Pune | 4.3 | Govt | JEE MainMHT CETTotal | 2 | NaN | 8 |
| 507 | Bansal College of Engineering | Madhya Pradesh - Other | 4.1 | Pvt | JEE MainMP BETotal | 2 | NaN | 6 |
| 508 | University Institute of Technology, Jhabua - R... | Madhya Pradesh - Other | 3.8 | NaN | JEE MainTotal | NaN | NaN | 7 |
| 509 | MET League of Colleges, Bhujbal Knowledge City... | Nashik | 4.2 | NaN | JEE MainMHT CETTotal | 4 | NaN | 4 |
| 510 | RMD Sinhgad School of Engineering | Pune | 3.7 | Govt | JEE MainMHT CETTotal | 3 | NaN | 11 |
511 rows × 8 columns
# Save the cleaned data as a CSV file in the working directory.
College_Data.to_csv("DataProject.csv")
# Read the file back to check it. to_csv() wrote the row index as the first,
# unnamed column, so load that column as the index again instead of letting it
# appear as a spurious "Unnamed: 0" data column.
pd.read_csv("DataProject.csv", index_col=0)
| Unnamed: 0 | College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | IIT Bombay - Indian Institute of Technology | Mumbai | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8.0 | 8.0 | 24 |
| 1 | 1 | IIT Delhi - Indian Institute of Technology | Delhi | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8.0 | 15.0 | 18 |
| 2 | 2 | VIT Vellore | Vellore | 4.2 | Pvt | TNEAVITEEECBSE 12thISCTamilnadu 12th +3 Total | 5.0 | 7.0 | 22 |
| 3 | 3 | IIT Madras - Indian Institute of Technology | Chennai | 4.6 | NaN | JEE MainJEE AdvancedTotal | 5.0 | 15.0 | 25 |
| 4 | 4 | DTU - Delhi Technological University | Delhi | 4.3 | NaN | JEE MainDASA UGJAC DelhiCBSE 12thISC +3 Total | 3.0 | 9.0 | 28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 506 | 506 | JSPM’s Bhivarabai Sawant Institute of Technolo... | Pune | 4.3 | Govt | JEE MainMHT CETTotal | 2.0 | NaN | 8 |
| 507 | 507 | Bansal College of Engineering | Madhya Pradesh - Other | 4.1 | Pvt | JEE MainMP BETotal | 2.0 | NaN | 6 |
| 508 | 508 | University Institute of Technology, Jhabua - R... | Madhya Pradesh - Other | 3.8 | NaN | JEE MainTotal | NaN | NaN | 7 |
| 509 | 509 | MET League of Colleges, Bhujbal Knowledge City... | Nashik | 4.2 | NaN | JEE MainMHT CETTotal | 4.0 | NaN | 4 |
| 510 | 510 | RMD Sinhgad School of Engineering | Pune | 3.7 | Govt | JEE MainMHT CETTotal | 3.0 | NaN | 11 |
511 rows × 9 columns
# Count the missing (NaN) values in each column
College_Data.isna().sum()
College Name 0 College Location 0 Rating 18 College Type 207 Entrance Exams 0 Expected Fee 12 Average Package 84 Courses Offered 0 dtype: int64
# Missing values found above: 207 in College Type, 18 in Rating, 12 in
# Expected Fee and 84 in Average Package.
# These columns still hold text (object dtype) at this point, so each gap is
# filled with that column's most frequent value (its mode).
# The filled column is assigned back explicitly: fillna(inplace=True) on
# College_Data[col] is a chained assignment that operates on an intermediate
# object and is not guaranteed to update College_Data (pandas only reports this
# through a FutureWarning, which this notebook suppresses).
# NOTE(review): mode imputation labels every college of unknown type with the
# majority type; with 207 of 511 types missing this noticeably skews the
# Govt/Pvt analysis - consider keeping an explicit "Unknown" category.
for column in ["Rating", "College Type", "Expected Fee", "Average Package"]:
    most_common = College_Data[column].mode()[0]
    College_Data[column] = College_Data[column].fillna(most_common)
College_Data
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 0 | IIT Bombay - Indian Institute of Technology | Mumbai | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 8 | 24 |
| 1 | IIT Delhi - Indian Institute of Technology | Delhi | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 15 | 18 |
| 2 | VIT Vellore | Vellore | 4.2 | Pvt | TNEAVITEEECBSE 12thISCTamilnadu 12th +3 Total | 5 | 7 | 22 |
| 3 | IIT Madras - Indian Institute of Technology | Chennai | 4.6 | Pvt | JEE MainJEE AdvancedTotal | 5 | 15 | 25 |
| 4 | DTU - Delhi Technological University | Delhi | 4.3 | Pvt | JEE MainDASA UGJAC DelhiCBSE 12thISC +3 Total | 3 | 9 | 28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 506 | JSPM’s Bhivarabai Sawant Institute of Technolo... | Pune | 4.3 | Govt | JEE MainMHT CETTotal | 2 | 4 | 8 |
| 507 | Bansal College of Engineering | Madhya Pradesh - Other | 4.1 | Pvt | JEE MainMP BETotal | 2 | 4 | 6 |
| 508 | University Institute of Technology, Jhabua - R... | Madhya Pradesh - Other | 3.8 | Pvt | JEE MainTotal | 2 | 4 | 7 |
| 509 | MET League of Colleges, Bhujbal Knowledge City... | Nashik | 4.2 | Pvt | JEE MainMHT CETTotal | 4 | 4 | 4 |
| 510 | RMD Sinhgad School of Engineering | Pune | 3.7 | Govt | JEE MainMHT CETTotal | 3 | 4 | 11 |
511 rows × 8 columns
# Count the missing values per column again to confirm the imputation
College_Data.isna().sum()
College Name 0 College Location 0 Rating 0 College Type 0 Entrance Exams 0 Expected Fee 0 Average Package 0 Courses Offered 0 dtype: int64
# Count the fully duplicated rows in the DataFrame
College_Data.duplicated().sum()
# No duplicate rows were found (the count is 0)
0
Type Casting by checking data info
College_Data.info() # summary of the frame: column dtypes, non-null counts, memory usage
<class 'pandas.core.frame.DataFrame'> RangeIndex: 511 entries, 0 to 510 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 College Name 511 non-null object 1 College Location 511 non-null object 2 Rating 511 non-null float64 3 College Type 511 non-null object 4 Entrance Exams 511 non-null object 5 Expected Fee 511 non-null int32 6 Average Package 511 non-null int32 7 Courses Offered 511 non-null int32 dtypes: float64(1), int32(3), object(4) memory usage: 26.1+ KB
# Cast each numeric-looking column from text to the numeric type it is used as:
# counts, fees and packages become integers, ratings become floats.
for column_name, target_type in (
    ("Courses Offered", int),
    ("Rating", float),
    ("Expected Fee", int),
    ("Average Package", int),
):
    College_Data[column_name] = College_Data[column_name].astype(target_type)
Attributes
# Show the frame summary again to check the dtypes after type casting
College_Data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 511 entries, 0 to 510 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 College Name 511 non-null object 1 College Location 511 non-null object 2 Rating 511 non-null float64 3 College Type 511 non-null object 4 Entrance Exams 511 non-null object 5 Expected Fee 511 non-null int32 6 Average Package 511 non-null int32 7 Courses Offered 511 non-null int32 dtypes: float64(1), int32(3), object(4) memory usage: 26.1+ KB
College_Data.describe() # descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
| Rating | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|
| count | 511.000000 | 511.000000 | 511.000000 | 511.00000 |
| mean | 3.967515 | 7.534247 | 5.041096 | 13.95499 |
| std | 0.331842 | 14.227504 | 2.911821 | 10.82897 |
| min | 1.400000 | 1.000000 | 0.000000 | 1.00000 |
| 25% | 3.800000 | 2.000000 | 4.000000 | 7.00000 |
| 50% | 4.000000 | 4.000000 | 4.000000 | 11.00000 |
| 75% | 4.200000 | 6.000000 | 6.000000 | 17.00000 |
| max | 4.800000 | 100.000000 | 23.000000 | 67.00000 |
# Show the first five rows of the DataFrame
College_Data.head()
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 0 | IIT Bombay - Indian Institute of Technology | Mumbai | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 8 | 24 |
| 1 | IIT Delhi - Indian Institute of Technology | Delhi | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 15 | 18 |
| 2 | VIT Vellore | Vellore | 4.2 | Pvt | TNEAVITEEECBSE 12thISCTamilnadu 12th +3 Total | 5 | 7 | 22 |
| 3 | IIT Madras - Indian Institute of Technology | Chennai | 4.6 | Pvt | JEE MainJEE AdvancedTotal | 5 | 15 | 25 |
| 4 | DTU - Delhi Technological University | Delhi | 4.3 | Pvt | JEE MainDASA UGJAC DelhiCBSE 12thISC +3 Total | 3 | 9 | 28 |
# Show the last five rows of the DataFrame
College_Data.tail()
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 506 | JSPM’s Bhivarabai Sawant Institute of Technolo... | Pune | 4.3 | Govt | JEE MainMHT CETTotal | 2 | 4 | 8 |
| 507 | Bansal College of Engineering | Madhya Pradesh - Other | 4.1 | Pvt | JEE MainMP BETotal | 2 | 4 | 6 |
| 508 | University Institute of Technology, Jhabua - R... | Madhya Pradesh - Other | 3.8 | Pvt | JEE MainTotal | 2 | 4 | 7 |
| 509 | MET League of Colleges, Bhujbal Knowledge City... | Nashik | 4.2 | Pvt | JEE MainMHT CETTotal | 4 | 4 | 4 |
| 510 | RMD Sinhgad School of Engineering | Pune | 3.7 | Govt | JEE MainMHT CETTotal | 3 | 4 | 11 |
# Print the dtype of each column, one per line, before moving on to the
# visualisations.
for column_name in (
    'College Name',
    'College Location',
    'Rating',
    'College Type',
    'Entrance Exams',
    'Expected Fee',
    'Average Package',
    'Courses Offered',
):
    print(College_Data[column_name].dtype)
object object float64 object object int32 int32 int32
# Total number of cells in the DataFrame (rows x columns)
College_Data.size
4088
# Shape of the DataFrame as a (rows, columns) tuple
College_Data.shape
(511, 8)
# Column labels of the DataFrame
College_Data.columns
Index(['College Name', 'College Location', 'Rating', 'College Type',
'Entrance Exams', 'Expected Fee', 'Average Package', 'Courses Offered'],
dtype='object')
# Row index of the DataFrame
College_Data.index
RangeIndex(start=0, stop=511, step=1)
# dtype of every column, via the dtypes attribute
College_Data.dtypes
College Name object College Location object Rating float64 College Type object Entrance Exams object Expected Fee int32 Average Package int32 Courses Offered int32 dtype: object
Statistical measures
# Statistical measures of the DataFrame, computed with pandas.
# Mean of each numeric column. numeric_only=True is passed because the frame
# also contains text columns, and pandas >= 2.0 raises a TypeError on them
# instead of silently skipping them.
College_Data.mean(numeric_only=True)
Rating 3.967515 Expected Fee 7.534247 Average Package 5.041096 Courses Offered 13.954990 dtype: float64
# Median of each numeric column (numeric_only=True skips the text columns,
# which pandas >= 2.0 would otherwise raise on).
College_Data.median(numeric_only=True)
Rating 4.0 Expected Fee 4.0 Average Package 4.0 Courses Offered 11.0 dtype: float64
# Standard deviation of each numeric column (numeric_only=True skips the text
# columns, which pandas >= 2.0 would otherwise raise on).
College_Data.std(numeric_only=True)
Rating 0.331842 Expected Fee 14.227504 Average Package 2.911821 Courses Offered 10.828970 dtype: float64
# Variance of each numeric column (numeric_only=True skips the text columns,
# which pandas >= 2.0 would otherwise raise on).
College_Data.var(numeric_only=True)
Rating 0.110119 Expected Fee 202.421864 Average Package 8.478700 Courses Offered 117.266598 dtype: float64
# Skewness of each numeric column (numeric_only=True skips the text columns,
# which pandas >= 2.0 would otherwise raise on).
College_Data.skew(numeric_only=True)
Rating -1.462397 Expected Fee 4.260500 Average Package 2.768802 Courses Offered 2.164579 dtype: float64
# Kurtosis of each numeric column (numeric_only=True skips the text columns,
# which pandas >= 2.0 would otherwise raise on).
College_Data.kurtosis(numeric_only=True)
Rating 7.388762 Expected Fee 18.817405 Average Package 9.171644 Courses Offered 6.285409 dtype: float64
Data Manipulation
# Group the colleges by type and total the expected fee and the average
# package within each group.
Gdata = (
    College_Data
    .groupby(by=["College Type"])[["Expected Fee", "Average Package"]]
    .agg(["sum"])
)
Gdata
| Expected Fee | Average Package | |
|---|---|---|
| sum | sum | |
| College Type | ||
| Govt | 887 | 655 |
| Pvt | 2963 | 1921 |
# Sort the DataFrame by rating, highest first, and keep the top 10 colleges
Rating_Wise = College_Data.sort_values(by = 'Rating' , ascending = False)
Top_10Rating = Rating_Wise.head(10)
Top_10Rating
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 272 | Dr. B.R. Ambedkar Institute of Technology | Port Blair | 4.8 | Pvt | CBSE 12thTotal | 42 | 3 | 7 |
| 92 | International Institute of Information Technol... | Bangalore | 4.7 | Pvt | JEE MainTotal | 22 | 23 | 1 |
| 7 | IIT Kanpur - Indian Institute of Technology | Kanpur | 4.7 | Govt | JEE MainJEE AdvancedTotal | 8 | 19 | 9 |
| 0 | IIT Bombay - Indian Institute of Technology | Mumbai | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 8 | 24 |
| 505 | I K Gujral Punjab Technical University, Hoshia... | Hoshiarpur | 4.6 | Pvt | JEE MainCBSE 12thPSEB 12th +1 Total | 3 | 4 | 6 |
| 240 | BVRIT Hyderabad College of Engineering for Women | Hyderabad | 4.6 | Govt | TS EAMCETTotal | 4 | 6 | 13 |
| 1 | IIT Delhi - Indian Institute of Technology | Delhi | 4.6 | Govt | JEE MainJEE AdvancedTotal | 8 | 15 | 18 |
| 264 | Shri Vishnu Engineering College for Women | West Godavari | 4.6 | Pvt | AP EAMCETCBSE 12thBIEAP +1 Total | 3 | 3 | 12 |
| 482 | Gyan Sagar College of Engineering | Sagar | 4.6 | Pvt | JEE MainMP BECBSE 12thMPBSE 12th +2 Total | 2 | 4 | 7 |
| 492 | Sagar Institute of Science, Technology and Eng... | Bhopal | 4.6 | Pvt | JEE MainMP BETotal | 2 | 4 | 41 |
# Sort the DataFrame by average package, highest first, and keep the top 10 colleges
Package_Wise = College_Data.sort_values(by = 'Average Package' , ascending = False)
Top_10Package = Package_Wise.head(10)
Top_10Package
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 92 | International Institute of Information Technol... | Bangalore | 4.7 | Pvt | JEE MainTotal | 22 | 23 | 1 |
| 7 | IIT Kanpur - Indian Institute of Technology | Kanpur | 4.7 | Govt | JEE MainJEE AdvancedTotal | 8 | 19 | 9 |
| 19 | Indian Institute of Technology, BHU, Varanasi | Varanasi | 4.3 | Pvt | JEE MainJEE AdvancedTotal | 9 | 18 | 26 |
| 16 | IIT Guwahati - Indian Institute of Technology | Guwahati | 4.5 | Govt | JEE MainJEE AdvancedAssam CEE +1 Total | 8 | 18 | 11 |
| 11 | BITS Pilani - Birla Institute of Technology an... | Pilani | 4.4 | Govt | BITSATTotal | 20 | 18 | 8 |
| 28 | IIT Mandi - Indian Institute of Technology | Mandi | 4.3 | Pvt | JEE MainJEE AdvancedTotal | 8 | 18 | 51 |
| 58 | IIIT Delhi - Indraprastha Institute of Informa... | Delhi | 4.5 | Pvt | JEE MainDASA UGJAC DelhiUCEED +2 Total | 17 | 17 | 13 |
| 6 | IIT Kharagpur - Indian Institute of Technology | Kharagpur | 4.5 | Govt | JEE MainJEE AdvancedTotal | 8 | 17 | 18 |
| 14 | IIT Hyderabad - Indian Institute of Technology | Hyderabad | 4.4 | Pvt | JEE MainJEE AdvancedTS EAMCET +1 Total | 8 | 16 | 25 |
| 361 | IIT Jammu - Indian Institute of Technology | Jammu | 4.3 | Pvt | JEE MainJEE AdvancedTotal | 8 | 16 | 11 |
# Sort the DataFrame by expected fee, highest first, and keep the top 10 colleges
Fee_Wise = College_Data.sort_values(by = 'Expected Fee' , ascending = False)
Top_10fee = Fee_Wise.head(10)
Top_10fee
| College Name | College Location | Rating | College Type | Entrance Exams | Expected Fee | Average Package | Courses Offered | |
|---|---|---|---|---|---|---|---|---|
| 101 | Panjab University | Chandigarh | 3.8 | Govt | JEE MainCBSE 12thPSEB 12th +1 Total | 100 | 4 | 14 |
| 458 | Indira Gandhi Engineering College (IGEC) | Jabalpur | 3.4 | Pvt | JEE MainMP BETotal | 97 | 4 | 2 |
| 409 | University Institute of Technology, Bhopal - R... | Bhopal | 3.6 | Pvt | JEE MainCBSE 12thMPBSE 12thMP BE +2 Total | 89 | 3 | 17 |
| 140 | J. C. Bose University of Science and Technolog... | Faridabad | 3.7 | Govt | JEE MainHSTESCBSE 12thHBSE 12th +2 Total | 86 | 4 | 3 |
| 277 | Cochin University of Science and Technology, K... | Kochi | 4.1 | Pvt | CUSAT CATCATTotal | 86 | 5 | 9 |
| 144 | PES College of Engineering | Karnataka - Other | 3.9 | Pvt | COMEDK UGETKCETCBSE 12thKarnataka 2nd PUC +2 T... | 84 | 4 | 7 |
| 298 | R.N.S. Institute of Technology | Bangalore | 3.8 | Pvt | JEE MainCOMEDK UGETKCETDCETCBSE 12thKarnataka ... | 78 | 4 | 13 |
| 44 | College of Engineering, Pune | Pune | 4.3 | Pvt | JEE MainMHT CETTotal | 75 | 8 | 12 |
| 317 | Central Institute of Technology | Assam - Other | 3.9 | Pvt | JEE MainJEE AdvancedAssam CEE +1 Total | 72 | 3 | 10 |
| 365 | UoH - University of Hyderabad | Hyderabad | 4.2 | Pvt | JEE MainTS EAMCETTotal | 72 | 5 | 9 |
# Pivot table of expected fee and average package: one row per college type,
# one column per location. No aggfunc is given, so pandas uses its default
# aggregation (the mean) for each cell.
City_Wise = College_Data.pivot_table(values = ["Expected Fee","Average Package"] , index = "College Type" , columns = "College Location")
City_Wise
| Average Package | ... | Expected Fee | |||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| College Location | Agartala | Agra | Ahmedabad | Ahmednagar | Aizawl | Ajmer | Aligarh | Allahabad | Ambala | Amethi | ... | Vadodara | Varanasi | Vellore | Vidisha | Vijayawada | Virudhunagar | Visakhapatnam | Warangal | Wardha | West Godavari |
| College Type | |||||||||||||||||||||
| Govt | 4.0 | NaN | 3.0 | 3.0 | 5.0 | NaN | NaN | 9.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | 4.0 | NaN | NaN |
| Pvt | 4.0 | 3.0 | 7.0 | 4.0 | 6.0 | 4.0 | 5.0 | 7.5 | 5.0 | 10.0 | ... | 6.0 | 9.0 | 5.0 | 1.0 | NaN | 3.0 | 2.333333 | 5.0 | 3.0 | 3.0 |
2 rows × 332 columns
# The same pivot table transposed: one row per location, one column per college type
College_Data.pivot_table(values = ["Expected Fee","Average Package"] , index = "College Location" , columns = "College Type")
| Average Package | Expected Fee | |||
|---|---|---|---|---|
| College Type | Govt | Pvt | Govt | Pvt |
| College Location | ||||
| Agartala | 4.0 | 4.0 | 4.0 | 8.000000 |
| Agra | NaN | 3.0 | NaN | 19.500000 |
| Ahmedabad | 3.0 | 7.0 | 6.0 | 8.000000 |
| Ahmednagar | 3.0 | 4.0 | 3.0 | 3.000000 |
| Aizawl | 5.0 | 6.0 | 1.0 | 5.000000 |
| ... | ... | ... | ... | ... |
| Virudhunagar | NaN | 3.0 | NaN | 3.000000 |
| Visakhapatnam | NaN | 4.0 | NaN | 2.333333 |
| Warangal | 4.0 | 7.0 | 4.0 | 5.000000 |
| Wardha | NaN | 4.0 | NaN | 3.000000 |
| West Godavari | NaN | 3.0 | NaN | 3.000000 |
166 rows × 4 columns
Data Visualization
# Bar chart: number of colleges at each rating value
Data1 = College_Data["Rating"].value_counts()
rating_axes = Data1.plot.bar(
    width=0.4,
    color=["red", "green", "yellow", "pink", "orange"],
)
rating_axes.legend()
rating_axes.set_title("RATING OF COLLEGES")
rating_axes.set_xlabel("Rating")
rating_axes.set_ylabel("No. of Colleges")
plt.show()
Observation: Most colleges have a good rating, that is, above 4.0.
# Pie chart: how the ratings are distributed among the 10 colleges with the
# highest expected fee
Data6 = Top_10fee["Rating"].value_counts()
pie_axes = Data6.plot.pie(autopct='%.0f%%')
pie_axes.set_title("Ratings of top 10 colleges")
pie_axes.legend(fontsize=5)
plt.show()
Observation: Among the top 10 colleges by fee, the highest rating is 4.3, held by a single college (10%).
# Count plot: number of colleges of each type
type_axes = sns.countplot(data=College_Data, x="College Type")
type_axes.set_title("Type of colleges")
type_axes.set_xlabel("Type of College")
type_axes.set_ylabel("No. of Colleges")
plt.show()
Observation: Of roughly 500 colleges, about 110 are government and about 390 are private.
# Histogram of the number of courses offered per college
College_Data['Courses Offered'].hist()
<Axes: >
Observation: More than 200 colleges offer about 10 courses.
# Histogram of the college ratings
College_Data['Rating'].hist()
<Axes: >
Observation: More than 250 colleges have a good rating of around 4.0.
# Histogram of the average package across all colleges
College_Data['Average Package'].hist()
<Axes: >
Observation: More than 300 colleges offer a placement package of at least 4 LPA.
# Histogram of the average package among the 10 colleges with the highest package
Top_10Package['Average Package'].hist()
<Axes: >
Observation: The top 10 colleges offer placement packages between 16 and 23 LPA.
# Scatter plot of average package (x-axis) against rating (y-axis)
plt.scatter(College_Data["Average Package"] , College_Data["Rating"] , color = "red")
<matplotlib.collections.PathCollection at 0x256842b4e10>
Observation: There appears to be a good correlation between rating and package.
# Seaborn histogram of the average package
package_axes = sns.histplot(College_Data["Average Package"])
package_axes.set_xlabel("Annual Package")
package_axes.set_ylabel("Distribution")
plt.show()
Observation: Most colleges offer a placement package of about 4 LPA.
BIVARIATE ANALYSIS
i) Numerical & Numerical a. Scatterplot b. Line plot c. Heatmap for correlation d. Joint plot ii) Numerical & Categorical a. Bar chart b. Violin plot iii) Categorical & Categorical a. Bar chart b. Grouped bar chart
# Scatter plot of average package (x-axis) against college name (y-axis) for
# the 10 colleges with the highest average package
plt.scatter(Top_10Package["Average Package"] , Top_10Package["College Name"] , color = "red")
<matplotlib.collections.PathCollection at 0x256f708ce10>
Observation: All the IITs offer a good average package.
# Heatmap of the pairwise correlations between the numeric columns.
# numeric_only=True is passed because the frame also contains text columns,
# and pandas >= 2.0 raises on them instead of silently skipping them.
sns.heatmap(College_Data.corr(numeric_only=True), annot=True, cmap="Set3", linecolor="black", linewidth=4)
plt.show()
Observation: There is a good correlation between rating, fees and package.
# Stacked bar chart of college type per location for the first 50 rows of
# the DataFrame
fig, ax = plt.subplots(figsize=(15, 8))
location_by_type = pd.crosstab(
    College_Data["College Location"][0:50],
    College_Data["College Type"][0:50],
)
location_by_type.plot(kind="bar", ax=ax, width=0.6, stacked=True)
ax.set_title('City vs Type of college')
ax.legend(bbox_to_anchor=(1, 1))
<matplotlib.legend.Legend at 0x256f77d5610>
Multivariate Analysis
# Build a ydata-profiling report for the whole DataFrame and display it
Profile = ProfileReport(College_Data , title = 'College Data')
Profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Save the profile report as an HTML file
Profile.to_file("DATAPROJECT.html")
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Open the DataFrame in PyGWalker's interactive, Tableau-style exploration UI
pg.walk(College_Data)